/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import mosdi.discovery.EvaluatedPattern;
import mosdi.discovery.MotifFinder;
import mosdi.discovery.ObjectiveFunction;
import mosdi.discovery.objectives.AnnotationSumObjective;
import mosdi.discovery.objectives.OccurrenceCountObjective;
import mosdi.discovery.strategies.BruteForceSearch;
import mosdi.discovery.strategies.OptimumSearch;
import mosdi.discovery.strategies.ThresholdSearch;
import mosdi.fa.Alphabet;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.IIDTextModel;
import mosdi.fa.MarkovianTextModel;
import mosdi.index.ScoreIndexCalculator;
import mosdi.index.SuffixTree;
import mosdi.index.SuffixTreeWalker;
import mosdi.paa.IndexSumDAA;
import mosdi.paa.PAA;
import mosdi.paa.TextBasedPAA;
import mosdi.util.BitArray;
import mosdi.util.Iupac;
import mosdi.util.IupacStringConstraints;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

public class IupacDiscoverySubcommand extends Subcommand {
	
	@Override
	public String usage() {
		return
		super.usage()+" [options] <pattern-length> <objectives> <fasta-file>\n" +
		"\n" +
		" General options\n" +
		"  -i: replace unknown characters in DNA sequences by $\n" +
		"\n" +
		" Controlling search STRATEGY:\n" +
		"  -T <pvalue-threshold>: Search for all motifs <= threshold (default: optimal motif only)\n" +
		"  -a: AND mode for option -T. All objectives (default: one objective) must be below threshold.\n" +
		"  -B: Perform brute force search: evaluate all motifs\n" +
		"\n" +
		" Controlling MOTIF SPACE to be searched:\n" +
		"  -m: minimum number of letters from {A,C,G,T},{W,S,R,Y,K,M},{B,D,H,V},{N}, default: \"0,0,0,0\"\n" +
		"  -M: maximum number of letters from {A,C,G,T},{W,S,R,Y,K,M},{B,D,H,V},{N}, default: \"<length>,0,0,<length>/2\"\n" +
		"  -l <lower-bound>: only consider patterns s.t. lower-bound<=pattern\n" +
		"  -u <upper-bound>: only consider patterns s.t. pattern<upper-bound\n" +
		"  -E <max-expectation>: upper bound for expectation (motifs with higher expectation\n" +
		"                       are not returned.)\n"+ 
		"  -r: consider reverse complement\n" +
		"\n" +
		" Allowed OBJECTIVES to be given as <objectives> (comma separated list):\n" +
		"   \"occ-count\": total number of occurrences\n" +
		"   \"annotation\": (EXPERIMENTAL) sum of annotations at motif positions\n" +
		"\n" +
		" Options for objective \"occ-count\"\n" +
//		"  -c <exp-clump-size-bound>: do not calculate upper bound for expected clump size dymanically,\n" +
//		"                             but use given threshold instead.\n" +
		"  -C <good-enough-bound>: when clump size bounds are computed dynamically, no new calculations\n" +
		"                          are made when a bound <=good_enough_bound is known (default: 1.01)\n"+
		"  -O <model-order>: use background model of given order (default: 0)\n" +
		"\n" +
		" Options for objective \"annotation\"\n" +
		"  -A <annotation.fa>: file containing an annotation track (mandatory)\n" +
		"  -b <bincount>: Number of bins for discretization of annotation scores (default: 11).\n" +
		"  -p <pseudocounts>: Pseudocounts for estimation of annotation model (default: 0.1).\n" +
		"  -R <order>: Order of annotation model (default: 2).\n" +
		"  -n: maximum number of occurrences, patterns that occur more often are not found.\n" +
		"      (default=1000)";
	}
	
	@Override
	public String description() {
		return "Runs motif discovery on (constrained) IUPAC patterns."; 
	}

	@Override
	public String name() {
		return "discovery";
	}

	@Override
	public int run(String[] args) {
		parseOptions(args, 3, "m:M:ac:BT:O:l:u:rE:C:iA:b:R:p:n:a");

		// Option dependencies
		exclusiveOptions("a", "T");
		exclusiveOptions("B", "T");

		// Mandatory arguments
		int patternLength = getIntArgument(0);
		String objectivesArgument = getStringArgument(1);
		String filename = getStringArgument(2);

		// Options
		final Alphabet iupacAlphabet = Alphabet.getIupacAlphabet();
		final Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		boolean considerReverse = getBooleanOption("r", false);
		int[] minFrequencies = {0,0,0,0};
		minFrequencies = getRangedIntTupleOption("m", 4, 0, patternLength, minFrequencies);
		int[] maxFrequencies = {patternLength,0,0,patternLength/2};
		maxFrequencies = getRangedIntTupleOption("M", 4, 0, patternLength, maxFrequencies);
		double pValueThreshold = getRangedDoubleOption("T", 0.0, 1.0, -1.0);
		boolean pValueThresholdAndMode = getBooleanOption("a", false);
		boolean bruteForce = getBooleanOption("B", false);
		int modelOrder = getNonNegativeIntOption("O", 0);
		int[] lowerBound = getStringOption("l", iupacAlphabet, null);
		int[] upperBound = getStringOption("u", iupacAlphabet, null);
		double maxExpectation = getRangedDoubleOption("E", 0.0, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
		double goodEnoughClumpSizeBound = getRangedDoubleOption("C", 1.0, Double.POSITIVE_INFINITY, 1.01);
		boolean replaceUnknownByMinusOne = getBooleanOption("i", false);
		String annotationFilename = getStringOption("A", null);
		int annotationBinCount = getPositiveIntOption("b", 11);
		int annotationModelOrder = getNonNegativeIntOption("R", 2);
		double annotationPseudoCounts = getDoubleOption("p", 0.1);
		int maxOccurrences = getNonNegativeIntOption("n", 1000);
		
		Set<String> objectiveNames = null;
		List<NamedSequence> namedSequences = null;
		StringTokenizer st = new StringTokenizer(objectivesArgument, ",",false);
		objectiveNames = new TreeSet<String>();
		while (st.hasMoreTokens()) {
			String token = st.nextToken();
			if (token.equals("occ-count") || token.equals("annotation")) {
				objectiveNames.add(token);
			} else {
				Log.errorln("Unknown objective: "+token);
				System.exit(1);
			}
		}
		if (objectiveNames.size()==0) {
			Log.errorln("No objective function specified!");
			System.exit(1);
		}
		if (objectiveNames.contains("annotation") && (annotationFilename==null)) {
			Log.errorln("Option -A is mandatory if \"annotation\" objective is used.");
			System.exit(1);
		}
		Log.startTimer();
		try{
			namedSequences = SequenceUtils.readFastaFile(filename, dnaAlphabet, replaceUnknownByMinusOne);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}
		Log.stopTimer("Reading FASTA file");
		int effectiveLength = 0;
		for (NamedSequence ns : namedSequences) effectiveLength+=ns.length()-patternLength+1;
		Log.startTimer();
		List<int[]> sequences = new ArrayList<int[]>(namedSequences.size());
		for (NamedSequence ns : namedSequences) sequences.add(ns.getSequence());
		FiniteMemoryTextModel textModel = (modelOrder==0)?new IIDTextModel(dnaAlphabet.size(), sequences):new MarkovianTextModel(modelOrder,dnaAlphabet.size(),sequences);
		Log.stopTimer("Estimating text model");
		
		SuffixTree suffixTree = new SuffixTree();
		// variables needed to optimize annotation sum
		int[] scoreMaximumAnnotation = null;
		int[] scoreSumAnnotation = null;
		double[] scoreDistribution = null;
		FiniteMemoryTextModel annotationModel = null;
		ScoreIndexCalculator scoreIndexCalculator = null;
		if (objectiveNames.contains("annotation")) {
			Log.startTimer();
			SequenceUtils.readAnnotationTrack("annotation0", annotationFilename, namedSequences, annotationBinCount);
			Log.stopTimer("Reading annotation track");
			int[] concatAnnotation = SequenceUtils.concatAnnotations(namedSequences, "annotation0", considerReverse);
			scoreIndexCalculator = new ScoreIndexCalculator(concatAnnotation, patternLength);
			suffixTree.addNodeConstructionListener(scoreIndexCalculator);
			List<int[]> annotations = new ArrayList<int[]>(namedSequences.size());
			for (NamedSequence ns : namedSequences)	annotations.add(ns.getAnnotationTrack("annotation0"));
			if (annotationModelOrder==0) {
				annotationModel = new IIDTextModel(annotationBinCount, annotations);
			} else {
				annotationModel = new MarkovianTextModel(annotationModelOrder, annotationBinCount, annotations, annotationPseudoCounts);
			}
			Log.startTimer();
			IndexSumDAA daa = new IndexSumDAA(annotationModel.getAlphabetSize(), patternLength*(annotationBinCount-1));
			PAA paa = new TextBasedPAA(daa, annotationModel);
			scoreDistribution = paa.computeValueDistribution(patternLength);
			Log.stopTimer("Computing distribution of annotation sums");
		}
		// allow sequences to be garbage collected
		sequences = null;
		int[] concatSequence = SequenceUtils.concatSequences(namedSequences, considerReverse);
		namedSequences = null;
		suffixTree.buildTree(concatSequence, dnaAlphabet.size(), patternLength);
		concatSequence = null;
		if (objectiveNames.contains("annotation")) {
			scoreMaximumAnnotation = scoreIndexCalculator.getScoreMaximumAnnotation();
			scoreSumAnnotation = scoreIndexCalculator.getScoreSumAnnotation();
		}
		if (Log.levelAtLeast(Log.Level.DEBUG)) {
			SuffixTreeWalker walker = suffixTree.getWalker();
			BitArray wildcard = new BitArray(dnaAlphabet.size());
			wildcard.invert();
			for (int i=1; i<=patternLength; ++i) {
				int[] nodes = walker.forward(wildcard);
				Log.printf(Log.Level.DEBUG, "Nodes for prefix length %d: %,d (memory: %,d bytes)\n",i,nodes.length,nodes.length*suffixTree.getNodeSize());
			}
		}
		int[] occurrenceCountAnnotation = null;
		if (objectiveNames.contains("occ-count") || objectiveNames.contains("annotation")) {
			occurrenceCountAnnotation = suffixTree.calcOccurrenceCountAnnotation();
		}
		
		Log.startTimer();
//		FactorialCalculator fc = new FactorialCalculator(10000);
		BitArray[] generalizedAlphabet = Iupac.toGeneralizedString("ABCDGHKMNRSTVWY").getPositions();
		MotifFinder motifFinder = new MotifFinder(suffixTree, generalizedAlphabet, considerReverse);
		
		// IidPValueSearch search = new IidPValueSearch(sequences, textModel.getCharacterDistribution(), pValueThreshold, occurrenceCountAnnotation, false, expectedClumpSizeBound);
		int[] maxMatchCountAnnotation = suffixTree.calcMaxMatchCountAnnotation(patternLength);
		
		List<ObjectiveFunction> objectives = new ArrayList<ObjectiveFunction>();
		for (String name : objectiveNames) {
			if (name.equals("occ-count")) {
				OccurrenceCountObjective o = new OccurrenceCountObjective(textModel, occurrenceCountAnnotation, maxMatchCountAnnotation, effectiveLength);
				o.setGoodEnoughClumpSizeBound(goodEnoughClumpSizeBound);
				if (!Double.isInfinite(maxExpectation)) o.setMaxExpectation(maxExpectation);
				objectives.add(o);
			}
			if (name.equals("annotation")) {
				AnnotationSumObjective o = new AnnotationSumObjective(maxOccurrences, scoreDistribution, occurrenceCountAnnotation, scoreMaximumAnnotation, scoreSumAnnotation);
				objectives.add(o);
			}
		}
		
		MotifFinder.SearchSpecification search = null;
		if (bruteForce) {
			if (Log.levelAtLeast(Log.Level.DEBUG)) {
				search = new BruteForceSearch(objectives, iupacAlphabet, System.out);
			} else {
				search = new BruteForceSearch(objectives, iupacAlphabet, null);
			}
		} else if (pValueThreshold!=-1.0) {
			search = new ThresholdSearch(objectives,  pValueThreshold, System.out, pValueThresholdAndMode);
		} else {
			search = new OptimumSearch(objectives);
		}
		Log.startTimer();
		motifFinder.findIupacPatterns(patternLength, new IupacStringConstraints(minFrequencies,maxFrequencies), search, lowerBound, upperBound);
		Log.stopTimer("Search for IUPAC pattern on suffix tree");
		double timeMotifDiscovery = Log.getLastPeriodCpu();
		Log.print(Log.Level.VERBOSE, "------------------- RESULTS -------------------\n");
		Log.println(Log.Level.STANDARD, "Format: >> pattern >objective> score minus-log-pvalue pvalue");
		for (EvaluatedPattern m : search.getResults()) {
			// StringBuilder sb = new StringBuilder();
			// sb.append(String.format(">>p_value>> %e LIN ", m.getPValue()));
			// sb.append(String.format(">>stats>> %s %d ", (m.getPattern()==null)?"null":iupacAlphabet.buildString(m.getPattern()), m.getScore()));
			Log.println(Log.Level.STANDARD, m.toString(iupacAlphabet, objectives));
		}
		//Log.printf(Log.Level.STANDARD, ">>> %s %d %d %t %e%n", search.getMotifsEvaluated(), (long)Math.round(Math.exp(logNumber)), timeInstanceSearch, bestPValue);
		Log.printf(Log.Level.VERBOSE, ">>!>total_time>>: %t %n", timeMotifDiscovery);
		return 0;
	}
}
